# -*- coding: utf-8 -*-
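"""
Spyder Editor

Builds topic heatmaps from a 20-topic distribution table: one heatmap of the
per-university mean topic values and one heatmap for each single university.
"""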

import os
from stat import S_ISREG, ST_CTIME, ST_MODE
import numpy as np
import re
import pandas as pd
import seaborn as sbs
from matplotlib import pyplot as plt
#sbs.set()
#%%
## Change user and directory in order to run the script

user = "ba8rb2"
#user = "olive"
#C:\Users\olive\Dropbox\Andere Aufgaben\Artikel\Schubi\RAE-REF
## Every directory below is a plain string that ends with a backslash,
## because file names are appended to it by string concatenation.
project_dir = "C:\\Users\\" + user + "\\Dropbox\\Andere Aufgaben\\Artikel\\Schubi\\"
root = project_dir + "Python\\"
path = root + "output\\"
## Needs the path separator between the user name and "Dropbox", exactly
## like `root` above.
refpath = project_dir + "RAE-REF\\"
graphic_output = path + "Graphics\\Heatmaps\\"

## Step 1: Generate a function that navigates through the profiles for each university
def entry_sorter(path):
    """Return the regular files directly inside ``path``, oldest first.

    Parameters
    ----------
    path : str
        Directory to list. Sub-directories are not descended into.

    Returns
    -------
    list of tuple
        ``(ctime, file_path)`` pairs sorted ascending by ``ctime`` and, for
        equal times, by ``file_path``. ``ctime`` is ``os.stat(...)[ST_CTIME]``
        (whole seconds); it is the creation time on Windows but the time of
        the last metadata change on Unix.

    Raises
    ------
    OSError
        If ``path`` cannot be listed or one of its entries cannot be stat'ed.
    """
    entries = []
    for file_name in os.listdir(path):
        file_path = os.path.join(path, file_name)
        file_stat = os.stat(file_path)
        # Keep regular files only; directories and special entries are skipped.
        if S_ISREG(file_stat[ST_MODE]):
            entries.append((file_stat[ST_CTIME], file_path))

    entries.sort()
    return entries


    


#%%
## Step 2: Load the raw data used for the further descriptive analysis
ref_path = f"C:\\Users\\{user}\\Dropbox\\Andere Aufgaben\\Artikel\\Schubi\\RAE-REF\\"

## Read sheet "Output", discard rows that have no abstract and renumber the
## rows (reset_index keeps the previous index as an "index" column).
df = (
    pd.read_excel(ref_path + "Sociology_2014.xlsx", "Output")
    .dropna(subset=["Abstract"])
    .reset_index()
)



#%%
## Step 3: Load Data with topic_distribution

topic_df = pd.read_excel(path + "topics\\" + "dataframe_Topic_20.xlsx")

## Remove the columns labelled 20, 21 and 22 (KeyError if one is missing)
topic_df = topic_df.drop(columns=[20, 21, 22])

## get Keys from data frame w. topic distribution
t_keys = list(topic_df)


## Step 4: rename topic_keys with previously defined topics


## Get ID and Institution Names

ids = list(set(topic_df["UKPRN"]))

df_institutions = pd.read_excel(ref_path + "Sociology_2014.xlsx",
                               sheet_name="Institution")
df_institutions = df_institutions[["UKPRN", "Name"]]

## Look the institution name up per row of topic_df, so that every name ends
## up on the row it belongs to regardless of how either table is sorted.
## UKPRNs are compared as strings on both sides.
name_by_ukprn = {
    str(ukprn): name
    for ukprn, name in zip(df_institutions["UKPRN"], df_institutions["Name"])
}
new_inst = [name_by_ukprn.get(str(ukprn)) for ukprn in topic_df["UKPRN"]]

## Fail loudly when a UKPRN has no entry in the institution sheet instead of
## silently producing rows without a university.
unmatched = sorted(
    {str(ukprn) for ukprn, name in zip(topic_df["UKPRN"], new_inst) if name is None}
)
if unmatched:
    raise ValueError(
        "No institution name found for UKPRN(s): " + ", ".join(unmatched)
    )

topic_df["University"] = new_inst



## Step5: Get heatmap for institutions

## Average the columns per university, then turn the "University" group key
## back into an ordinary column.
topic_df_grouped = (
    topic_df
    .groupby(by="University")
    .mean()
    .reset_index()
)

## Export the grouped table; it is read back in by the next cell.
topic_df_grouped.to_excel(path + "2020_01_05_topics20_grouped.xlsx")




#%%
## Reload the grouped table from the Excel export
topic_df_grouped = pd.read_excel(path + "2020_01_05_topics20_grouped.xlsx")


## Step 6: Get the 20 topics:
## (one label per topic, used as x-axis tick labels of the heatmap)

x_topic_labels = ["Social structure analysis & quantitative methodology",
                  "Critical sociology, body sociology, abortion",
                  "Urban planning, displacement policy",
                  "Religion and dismanteling of Stereotypes",
                  "Health sociology, mental illness",
                  "Migration and Integration",
                  "Youth sociology",
                  "Crime prevention and victimology",
                  "First generation students, mental illness",
                  "Health care system and Privacy",
                  "Illicit migration, governmentality",
                  "Health inequalities in urban and rural spaces",
                  "Globalization, cosmopolisation",
                  "Gender studies with focus on cultural expression and empowerment",
                  "Political sociology, stem cell debate",
                  "Postcolonial Studies and Gender Studies",
                  "Social classes and their value systems",
                  "Technological and social evolution",
                  "Political sociology with focus on participation and political supression",
                  "Gender studies with the focus on political activism"]

## Step 7: Generate Heatmap for universities

## Only columns whose label is a plain int are plotted
topic_keys = [column for column in topic_df_grouped.columns if type(column) is int]

figname = "topic_20_universities"

## One row per university, one column per topic
plt.figure(figsize=[10, 21])
plt.imshow(topic_df_grouped[topic_keys])
plt.colorbar()
plt.yticks(
    ticks=range(len(topic_df_grouped)),
    labels=topic_df_grouped["University"],
)
plt.xticks(
    ticks=range(len(x_topic_labels)),
    labels=x_topic_labels,
    rotation=90,
)
plt.tight_layout()

plt.savefig(f"{graphic_output}2020_05_01_{figname}.png", dpi=600)
plt.close()
#


#%%

## Step 8: generate Heatmaps for each university


## Distinct university names found in topic_df
universities = list(set(topic_df["University"]))
for u in universities:
    ## Rows of this university, restricted to the topic columns
    belongs_to_u = topic_df["University"] == u
    df_part = topic_df.loc[belongs_to_u, topic_keys]

    ## Draw the heatmap and write it to its own PNG file
    figname = "05_01_2020" + "topic_20" + str(u)
    plt.figure()
    plt.imshow(df_part)
    plt.colorbar()
    plt.title("heatmap of topics for " + u)
    plt.tight_layout()

    plt.savefig(graphic_output + figname + ".png", dpi=600)
    plt.close()


